# Parser for WAIS question files.
# The syntax is (I am really making this up; there is no decent grammar):
#   file:     node
#   node:     record | list
#   record:   '(' keyword (keyword value)* ')'
#   list:     '(' record* ')'
#   value:    string | keyword | othertoken | node | '#' bytelist
#   bytelist: '(' number* ')'
# Tokens are really almost anything; only strings are treated specially;
# keywords are tokens starting with ':'.
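#
# For example, a question might look roughly like this (the fragment is
# made up for illustration; it is not taken from a real WAIS question file):
#
#   (:question
#       :version     2
#       :seed-words  "australian wine"
#       :sourcepath  #( 0 1 2 )
#       :sources     ( (:source :filename "directory-of-servers.src") )
#   )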

import regex
import string


# Class representing a record.
# This is accessed as if it is a dictionary.
# Limited sequential access is also supported: "for k, v in r: ..."
#
class Record:
    #
    def __init__(self, type):
        self.type = type
        self.itemlist = []
    #
    def __repr__(self):
        s = '(:' + self.type + '\n'
        for k, v in self.itemlist:
            v = str(v)
            if '\n' in v:
                lines = string.splitfields(v, '\n')
                v = string.joinfields(lines, '\n ')
            s = s + ' :' + k + ' ' + v + '\n'
        s = s + ')'
        return s
    #
    def __setitem__(self, keyword, value):
        for i in range(len(self.itemlist)):
            if keyword == self.itemlist[i][0]:
                self.itemlist[i] = (keyword, value)
                return
        self.itemlist.append((keyword, value))
    #
    def __delitem__(self, keyword):
        for i in range(len(self.itemlist)):
            if keyword == self.itemlist[i][0]:
                del self.itemlist[i]
                return
        raise KeyError, 'keyword not in Record: ' + repr(keyword)
    #
    def __getitem__(self, keyword):
        if type(keyword) == type(0):
            # Sequence type access
            return self.itemlist[keyword]
        # Mapping type access
        for k, v in self.itemlist:
            if k == keyword: return v
        raise KeyError, 'keyword not in Record: ' + repr(keyword)
    #
    def __len__(self):
        return len(self.itemlist)
    #
    def keys(self):
        keys = []
        for k, v in self.itemlist:
            keys.append(k)
        return keys
    #
    def has_key(self, keyword):
        for k, v in self.itemlist:
            if k == keyword: return 1
        return 0
    #
    def gettype(self):
        return self.type
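
# Illustrative use of Record (the field names below are made up):
#
#   r = Record('source')
#   r['filename'] = '"foo.src"'     # values are stored as raw tokens
#   r['port'] = '210'
#   print r                         # prints "(:source\n :filename ...\n)"
#   for k, v in r: print k, v       # sequential access to (keyword, value)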

# Class representing a list of values.
#
class List:
    #
    def __init__(self, *args):
        self.list = []
        for item in args:
            self.list.append(item)
    #
    def __repr__(self):
        s = '(\n'
        for item in self.list:
            item = str(item)
            if '\n' in item:
                lines = string.splitfields(item, '\n')
                item = string.joinfields(lines, '\n ')
            s = s + ' ' + item + '\n'
        s = s + ')'
        return s
    #
    def append(self, item):
        self.list.append(item)
    #
    def insert(self, i, item):
        self.list.insert(i, item)
    #
    def remove(self, item):
        self.list.remove(item)
    #
    def __len__(self):
        return len(self.list)
    #
    def __getitem__(self, i):
        return self.list[i]
    #
    def __setitem__(self, i, value):
        self.list[i] = value
    #
    def __delitem__(self, i):
        del self.list[i]
    #
    def __getslice__(self, i, j):
        new = List()
        for item in self.list[i:j]:
            new.append(item)
        return new


# Class representing a list of bytes.
#
class BytesList:
    #
    def __init__(self):
        self.bytes = ''
    #
    def __repr__(self):
        s = '#('
        for byte in self.bytes:
            s = s + ' ' + str(ord(byte))
        s = s + ' )'
        return s
    #
    def append(self, value):
        try:
            i = string.atoi(value)
        except string.atoi_error:
            raise SyntaxError, (value, 'byte')
        try:
            c = chr(i)
        except ValueError:
            raise SyntaxError, (value, 'byte in 0..255')
        self.bytes = self.bytes + c
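
# Illustrative use: a '#' bytelist such as  #( 72 105 )  is built by
# appending the number tokens one by one, e.g.
#
#   b = BytesList()
#   b.append('72'); b.append('105')
#   print b                         # prints: #( 72 105 )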

# Regular expressions used by the tokenizer, and "compiled" versions
#
wspat = '\([ \t\n\r\f]+\|;.*\n\)*'
tokenpat = '[()#"]\|[^()#"; \t\n\r\f]+'
stringpat = '"\(\\\\.\|[^\\"]\)*"'  # "\(\\.\|[^\"]\)*"
wsprog = regex.compile(wspat)
tokenprog = regex.compile(tokenpat)
stringprog = regex.compile(stringpat)
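
# For example (illustrative): on the input line
#       (:seed-words "australian wine")   ; a comment
# wsprog skips runs of whitespace and ';' comments, and tokenprog picks off
# the tokens '(', ':seed-words', '"' and ')'; the lone '"' makes the
# tokenizer switch to stringprog, which then matches the complete string
# token '"australian wine"'.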

# Parser base class without look-ahead.
# Instantiate each time you want to parse a file.
#
class RealBaseParser:
    #
    def __init__(self, input):
        #
        # 'input' should have a parameterless method readline()
        # which returns the next line, including trailing '\n',
        # or the empty string if there is no more data.
        # An open file will do nicely, as does an instance
        # of StringInput below.
        #
        self.input = input
        self.lineno = 0
        #
        # Reset the scanner interface.
        #
        self.reset()
    #
    def reset(self):
        self.nextline = ''
        self.pos = 0
        self.tokstart = 0
        self.eofseen = 0
    #
    # The real work of getting a token is done here.
    # This is the first place to look if you think
    # the parser is too slow.
    #
    def getnexttoken(self):
        while 1:
            k = wsprog.match(self.nextline, self.pos)
            if k < 0:
                raise SyntaxError, ('', 'whitespace')
            self.pos = self.pos + k
            k = tokenprog.match(self.nextline, self.pos)
            if k >= 0:
                break
            #
            # End of line hit
            #
            if self.eofseen:
                self.nextline = ''
            else:
                self.nextline = self.input.readline()
            self.pos = self.tokstart = 0
            if not self.nextline:
                if self.eofseen:
                    raise EOFError
                self.eofseen = 1
                return ''
            self.lineno = self.lineno + 1
        #
        # Found a token
        #
        self.tokstart, self.pos = self.pos, self.pos + k
        token = self.nextline[self.tokstart:self.pos]
        if token == '"':
            #
            # Get the whole string -- may read more lines
            #
            k = stringprog.match(self.nextline, self.tokstart)
            while k < 0:
                cont = self.input.readline()
                if not cont:
                    k = len(self.nextline) - self.tokstart
                    break
                self.nextline = self.nextline + cont
                self.lineno = self.lineno + 1
                k = stringprog.match(self.nextline, self.tokstart)
            self.pos = self.tokstart + k
            token = self.nextline[self.tokstart:self.pos]
        return token
    #
    # Default error handlers.
    #
    def reporterror(self, filename, message, fp):
        fp.write(filename)
        fp.write(':' + `self.lineno` + ': ')
        fp.write(message)
        fp.write('\n')
        self.printerrorline(fp)
    #
    def printerrorline(self, fp):
        line = self.nextline
        fp.write(line)
        if line[-1:] <> '\n':
            fp.write('\n')
        for i in range(len(line)):
            if i >= self.tokstart:
                n = max(1, self.pos - i)
                fp.write('^'*n)
                break
            elif line[i] == '\t':
                fp.write('\t')
            elif ' ' <= line[i] < '\177':
                fp.write(' ')
        fp.write('\n')
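
# reporterror() produces output of roughly this shape (illustrative; the
# exact caret position depends on where the offending token starts):
#
#   <stdin>:1: Syntax error: got ':', expected ':<keyword>'
#   (:question : 2)
#              ^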

# Parser base class. Instantiate each time you want to parse a file.
# This supports a single token look-ahead.
#
class BaseParser(RealBaseParser):
    #
    def reset(self):
        RealBaseParser.reset(self)
        self.pushback = ''
    #
    def peektoken(self):
        if not self.pushback:
            self.pushback = self.getnexttoken()
        return self.pushback
    #
    def gettoken(self):
        if self.pushback:
            token = self.pushback
            self.pushback = ''
        else:
            token = self.getnexttoken()
        if token == '':
            raise EOFError
        return token
    #
    def ungettoken(self, token):
        if self.pushback:
            raise AssertionError, 'more than one ungettoken'
        # print 'pushback:', token
        self.pushback = token


# Parser for a node. Instantiate, and call getnode() to parse a node.
#
class Parser(BaseParser):
    #
    # Parse a node. This is highly recursive.
    #
    def getnode(self):
        self.open()
        # This can be either a list or a record
        if self.peektoken() in ('(', ')'):  # It's a list
            list = List()
            while self.more():
                list.append(self.getnode())
            self.close()
            return list
        # Not a list, must be a record
        type = self.getkeyword()
        rec = Record(type)
        while self.more():
            keyword = self.getkeyword()
            value = self.getvalue()
            rec[keyword] = value
        self.close()
        return rec
    #
    def getkeyword(self):
        t = self.gettoken()
        if t[0] <> ':' or t == ':':
            raise SyntaxError, (t, ':<keyword>')
        return t[1:]
    #
    def getvalue(self):
        t = self.peektoken()
        if t == '(':
            return self.getnode()
        if t == '#':
            self.expect('#')
            return self.getbyteslist()
        if t == ')':
            raise SyntaxError, (t, '<value>')
        return self.gettoken()
    #
    def getbyteslist(self):
        bytes = BytesList()
        self.open()
        while self.more():
            bytes.append(self.getbyte())
        self.close()
        return bytes
    #
    def getbyte(self):
        return self.gettoken()
    #
    # Shorthands for frequently occurring parsing operations
    #
    def open(self):
        self.expect('(')
    #
    def close(self):
        self.expect(')')
    #
    def expect(self, exp):
        t = self.gettoken()
        if t <> exp:
            raise SyntaxError, (t, exp)
    #
    def more(self):
        if self.peektoken() == ')':
            return 0
        else:
            return 1


# A class to parse from a string
#
class StringInput:
    #
    def __init__(self, string):
        self.string = string
        self.pos = 0
    #
    def __repr__(self):
        return '<StringInput instance, string=' + `self.string` \
            + ', pos=' + `self.pos` + '>'
    #
    def readline(self):
        string = self.string
        i = self.pos
        n = len(string)
        while i < n:
            if string[i] == '\n':
                i = i+1
                break
            i = i+1
        string = string[self.pos : i]
        self.pos = i
        return string
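
# Illustrative use (the question text is made up):
#
#   p = Parser(StringInput('(:question :version 2 :seed-words "wine")'))
#   q = p.getnode()
#   print q['seed-words']           # prints: "wine"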

# Convenience routines to parse a file
#
def parsefile(filename):
    f = open(filename, 'r')
    p = Parser(f)
    result = p.getnode()
    f.close()
    return result
#
def parse(f):
    p = Parser(f)
    return p.getnode()
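
# Illustrative use (the filename is made up; gettype() assumes the file's
# top-level node is a record rather than a list):
#
#   question = parsefile('my-question.qst')
#   print question.gettype()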

# Test driver for tokenizer -- reads from stdin
#
def testtokenizer():
    import sys
    p = Parser(sys.stdin)
    try:
        while 1: p.gettoken()
    except EOFError:
        print 'EOF'
    except SyntaxError, msg:
        p.reporterror('<stdin>', 'Syntax error: ' + str(msg), sys.stderr)


# Test driver for parser -- reads from stdin
#
def testparser():
    import sys
    p = Parser(sys.stdin)
    try:
        x = p.getnode()
    except EOFError:
        print 'unexpected EOF at line', p.lineno
        return
    except SyntaxError, msg:
        if type(msg) == type(()):
            gotten, expected = msg
            msg = 'got ' + `gotten` + ', expected ' + `expected`
        p.reporterror('<stdin>', 'Syntax error: ' + msg, sys.stderr)
        return
    print x
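
# When run as a script, exercise the parser on stdin (an assumption about
# the intended default; the module defines the test drivers above but does
# not invoke them):
if __name__ == '__main__':
    testparser()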